# Replication code for: ``Media Attention and Bureaucratic Responsiveness,'' Forthcoming in Journal of Public Administration Research and Theory
# by Erlich, Aaron, Daniel Berliner, Brian Palmer-Rubin, and Benjamin E. Bagozzi.

library(lubridate)
library(MASS)
library(lfe)
library(lmtest)
library(texreg)
library(data.table)
library(doParallel)
library(foreach)


setwd("YOUR FILE PATH HERE")


# Initial request data: 
# This is the already-processed request/response/topics data from previous work, restricted to only the agencies that we have anomaly data for.
# Data sources:
# Berliner, Daniel, Benjamin E. Bagozzi, and Brian Palmer-Rubin. "What information do citizens want? Evidence from one million information requests in Mexico." World Development 109 (2018): 222-235.
# Berliner, Daniel, Benjamin E. Bagozzi, Brian Palmer-Rubin, and Aaron Erlich. "The Political Logic of Government Disclosure: Evidence from Information Requests in Mexico." The Journal of Politics 83(1). 

d<-fread("JPART_replication_data.csv") 


# restrict data to requests for public information only (omit personal data requests, which will be used in placebo test further below)
d<- d[d$TIPOSOLICITUD=="Información Pública",]

# Measure each agency's "count in" and "count out" on each day

d$date0<-ymd(as.character(d$date0))
d$dateR<-ymd(as.character(d$dateR))

udep<-sort(unique(d$dep_short))

dayvector<-(ymd(min(d$date0))-days(1)) + c(0:((12*365)+84)) * days(1)
max(dayvector)
max(d$date0)

ddep0<-as.data.frame(dayvector)
names(ddep0)[1]<-"DAY"
ddep0$YEAR<-as.numeric(substring(as.character(ddep0$DAY),1,4))
ddep0$MONTH<-as.numeric(substring(as.character(ddep0$DAY),6,7))



ddep<-NULL
for(i in 1:length(udep)){
  ddep_temp<-ddep0
  ddep_temp$dep_short<-udep[i]
  ddep_temp$DAY<-as.character(ddep_temp$DAY)
  ddep_temp$daycounter<-c(0:((12*365)+84))+1

  df_count_in<-as.data.frame(table(d$date0[d$dep_short==udep[i]]))
  names(df_count_in)<-c("DAY","count_in")
  df_count_in$DAY<-as.character(df_count_in$DAY)
  ddep_temp<-merge(ddep_temp, df_count_in, by="DAY", all.x=T, all.y=T)

  df_count_out<-as.data.frame(table(d$dateR[d$dep_short==udep[i] & d$RESPUESTA!="Sin respuesta" & d$dateR<=ymd("2015-08-31")])) #Don't count "Sin respuesta" -- these have not yet received a response; and stop at August 31 2015 when data stops.
  names(df_count_out)<-c("DAY","count_out")
  df_count_out$DAY<-as.character(df_count_out$DAY)
  ddep_temp<-merge(ddep_temp, df_count_out, by="DAY", all.x=T, all.y=T)

  ddep_temp$count_in[is.na(ddep_temp$count_in)]<-0
  ddep_temp$count_out[is.na(ddep_temp$count_out)]<-0

  # SSP and COFEPRIS did not exist for the full period of time
  if(udep[i]=="COFEPRIS") ddep_temp<-ddep_temp[ddep_temp$daycounter>=ddep_temp$daycounter[ddep_temp$DAY==min(d$date0[d$dep_short=="COFEPRIS"])],]
  if(udep[i]=="SSP") ddep_temp<-ddep_temp[ddep_temp$daycounter<=ddep_temp$daycounter[ddep_temp$DAY==max(d$date0[d$dep_short=="SSP"])],]

  ddep<-rbind(ddep, ddep_temp)
}




# Data on media anomalies, based on anomaly-detection of media time series for each agency, then hand-coded to assign categories.
# Data and code for anomaly-detection on the basis of agency media coverage available on GitHub at: https://github.com/demotip/jpart_erlich_etal_mexico/
anomaly_data<-read.csv("final_anomalies_forJPART.csv")

# remove the five that are not actual anomalies
anomaly_data<-anomaly_data[anomaly_data$NotRelevant==0,]

# remove the ones with no identifiable theme
anomaly_data<-anomaly_data[anomaly_data$NoTheme==0,]

# don't use anomalies after the request data ends
anomaly_data<-anomaly_data[ymd(anomaly_data$start_date)<=ymd("2015-08-18"),]

anomaly_udep<-as.character(unique(anomaly_data$agency))


# add to agency-day panel

ddep$anom<-0
ddep$anom_negative<-0
ddep$anom_positive<-0
ddep$anom_policy<-0
ddep$anom_personnel<-0
ddep$anom_external<-0
ddep$anom_govfailure<-0
ddep$anom_corruption<-0
ddep$anom_controversy<-0

anomaly_data$start_date<-ymd(as.character(anomaly_data$start_date))
anomaly_data$end_date<-ymd(as.character(anomaly_data$end_date))

for(i in 1:nrow(anomaly_data)){
  tempagency<-anomaly_data$agency[i]
  tempstart<-anomaly_data$start_date[i]
  tempend<-anomaly_data$end_date[i]
  tempseq<-as.character(seq(tempstart, tempend, 1))

  ddep$anom[ddep$dep_short==tempagency & ddep$DAY %in% tempseq]<-1
  if (anomaly_data$Negative[i]==1) ddep$anom_negative[ddep$dep_short==tempagency & ddep$DAY %in% tempseq]<-1
  if (anomaly_data$Negative[i]==0) ddep$anom_positive[ddep$dep_short==tempagency & ddep$DAY %in% tempseq]<-1
  if (anomaly_data$Policy[i]==1) ddep$anom_policy[ddep$dep_short==tempagency & ddep$DAY %in% tempseq]<-1
  if (anomaly_data$Personnel[i]==1) ddep$anom_personnel[ddep$dep_short==tempagency & ddep$DAY %in% tempseq]<-1
  if (anomaly_data$External[i]==1) ddep$anom_external[ddep$dep_short==tempagency & ddep$DAY %in% tempseq]<-1
  if (anomaly_data$GovFailure[i]==1) ddep$anom_govfailure[ddep$dep_short==tempagency & ddep$DAY %in% tempseq]<-1
  if (anomaly_data$Corruption[i]==1) ddep$anom_corruption[ddep$dep_short==tempagency & ddep$DAY %in% tempseq]<-1
  if (anomaly_data$Controversy[i]==1) ddep$anom_controversy[ddep$dep_short==tempagency & ddep$DAY %in% tempseq]<-1
}






# aggregate to weeks
# count in: sum
# count out: sum
# anomalies: proportion of week covered

ddep$WEEK<-floor(ddep$daycounter/7)


weeklydata<-NULL
for(i in 1:length(udep)){
  tempweeklydata<-as.data.frame(cbind(
  tapply(ddep$WEEK[ddep$dep_short==udep[i]], ddep$WEEK[ddep$dep_short==udep[i]], head, 1),

  ymd(tapply(ddep$DAY[ddep$dep_short==udep[i]], ddep$WEEK[ddep$dep_short==udep[i]], head, 1)),
  tapply(ddep$YEAR[ddep$dep_short==udep[i]], ddep$WEEK[ddep$dep_short==udep[i]], head, 1),
  tapply(ddep$MONTH[ddep$dep_short==udep[i]], ddep$WEEK[ddep$dep_short==udep[i]], head, 1),

  tapply(ddep$daycounter[ddep$dep_short==udep[i]], ddep$WEEK[ddep$dep_short==udep[i]], head, 1),
  tapply(ddep$count_in[ddep$dep_short==udep[i]], ddep$WEEK[ddep$dep_short==udep[i]], sum),
  tapply(ddep$count_out[ddep$dep_short==udep[i]], ddep$WEEK[ddep$dep_short==udep[i]], sum),  

  tapply(ddep$anom[ddep$dep_short==udep[i]], ddep$WEEK[ddep$dep_short==udep[i]], mean),
  tapply(ddep$anom_negative[ddep$dep_short==udep[i]], ddep$WEEK[ddep$dep_short==udep[i]], mean),
  tapply(ddep$anom_positive[ddep$dep_short==udep[i]], ddep$WEEK[ddep$dep_short==udep[i]], mean),
  tapply(ddep$anom_policy[ddep$dep_short==udep[i]], ddep$WEEK[ddep$dep_short==udep[i]], mean),
  tapply(ddep$anom_personnel[ddep$dep_short==udep[i]], ddep$WEEK[ddep$dep_short==udep[i]], mean),
  tapply(ddep$anom_external[ddep$dep_short==udep[i]], ddep$WEEK[ddep$dep_short==udep[i]], mean),
  tapply(ddep$anom_govfailure[ddep$dep_short==udep[i]], ddep$WEEK[ddep$dep_short==udep[i]], mean),
  tapply(ddep$anom_corruption[ddep$dep_short==udep[i]], ddep$WEEK[ddep$dep_short==udep[i]], mean),
  tapply(ddep$anom_controversy[ddep$dep_short==udep[i]], ddep$WEEK[ddep$dep_short==udep[i]], mean)
  ))

  tempweeklydata<-cbind(rep(udep[i], nrow(tempweeklydata)) , tempweeklydata)
  weeklydata<-rbind(weeklydata,  tempweeklydata)
}

dim(weeklydata)

names(weeklydata)<-c("dep_short","WEEK","DAY","YEAR","MONTH","daycounter","count_in","count_out",
  "anom","anom_negative","anom_positive","anom_policy","anom_personnel","anom_external","anom_govfailure","anom_corruption","anom_controversy")




# create lagged weekly count_in and count_out

lagger<-function(variable, country, year, laglength){
  
  country<-as.character(country)
  laggedvar<-rep(NA,length(variable))
  
  leadingNAs<-rep(NA,laglength)
  countryshift<-c(leadingNAs, country[1:(length(country)-laglength)])
  
  variableshift<-c(leadingNAs, variable[1:(length(variable)-laglength)])
  
  replacementrefs<-country==countryshift
  replacementrefs[is.na(replacementrefs)==T]<-FALSE
  laggedvar[replacementrefs]<-variableshift[replacementrefs]
  
  laggedvar
  
  } # close lagger function


weeklydata$count_in_l1<-lagger(weeklydata$count_in, weeklydata$dep_short, weeklydata$WEEK, 1)
weeklydata$count_out_l1<-lagger(weeklydata$count_out, weeklydata$dep_short, weeklydata$WEEK, 1)




# Panel models

# note panels begin in 2005 since that is when newspaper data -- used for anomalies -- begins

# panel fixed effects models of request count in
m1<-felm(log(count_in+1)~ log(count_in_l1+1)+ anom  | dep_short + WEEK | 0 | dep_short, data=weeklydata[weeklydata$YEAR>=2005,])
m2<-felm(log(count_in+1)~anom_negative + anom_positive + log(count_in_l1+1) | dep_short + WEEK | 0 | dep_short, data=weeklydata[weeklydata$YEAR>=2005,])
m3<-felm(log(count_in+1)~anom_govfailure+anom_corruption+anom_controversy + anom_positive + log(count_in_l1+1) | dep_short + WEEK | 0 | dep_short, data=weeklydata[weeklydata$YEAR>=2005,])
m4<-felm(log(count_in+1)~anom_policy+anom_personnel+anom_external+anom_govfailure+anom_corruption + log(count_in_l1+1) | dep_short + WEEK | 0 | dep_short, data=weeklydata[weeklydata$YEAR>=2005,])

texreg(list(m1, m2,  m3, m4), digits=3,  stars = c(0.01, 0.05, 0.1))

# tables with p-values instead of SEs, per journal guidelines
texreg(list(m1, m2,  m3, m4), 
  override.se=lapply(list(m1, m2,  m3, m4), function(x) summary(x)$coef[,4]), 
  override.pval=lapply(list(m1, m2,  m3, m4), function(x) summary(x)$coef[,4]), 
  digits=3,  stars = c(0.01, 0.05, 0.1))



# panel fixed effects models of response count out
m1<-felm(log(count_out+1)~log(count_in_l1+1)+log(count_out_l1+1) + anom  | dep_short + WEEK | 0 | dep_short, data=weeklydata[weeklydata$YEAR>=2005,])
m2<-felm(log(count_out+1)~log(count_in_l1+1)+log(count_out_l1+1) + anom_negative + anom_positive | dep_short + WEEK | 0 | dep_short, data=weeklydata[weeklydata$YEAR>=2005,])
m3<-felm(log(count_out+1)~log(count_in_l1+1)+log(count_out_l1+1) + anom_govfailure+anom_corruption+anom_controversy + anom_positive | dep_short + WEEK | 0 | dep_short, data=weeklydata[weeklydata$YEAR>=2005,])
m4<-felm(log(count_out+1)~log(count_in_l1+1)+log(count_out_l1+1) + anom_policy+anom_personnel+anom_external+anom_govfailure+anom_corruption  | dep_short + WEEK | 0 | dep_short, data=weeklydata[weeklydata$YEAR>=2005,])

texreg(list(m1, m2,  m3, m4), digits=3,  stars = c(0.01, 0.05, 0.1))

# tables with p-values instead of SEs, per journal guidelines
texreg(list(m1, m2,  m3, m4), 
  override.se=lapply(list(m1, m2,  m3, m4), function(x) summary(x)$coef[,4]), 
  override.pval=lapply(list(m1, m2,  m3, m4), function(x) summary(x)$coef[,4]), 
  digits=3,  stars = c(0.01, 0.05, 0.1))








########################################################################

# Placebo test w personal data requests

pd<-fread("JPART_replication_data.csv") 

# restrict data to personal data requests only
pd<- pd[pd$TIPOSOLICITUD=="Datos Personales",]


# Create new weekly data for personal-data-requests only, then merge in with original weekly data

udep<-sort(unique(pd$dep_short))
dayvector<-(ymd(min(pd$date0))-days(1)) + c(0:((12*365)+84)) * days(1)
ddep0<-as.data.frame(dayvector)
names(ddep0)[1]<-"DAY"
ddep0$YEAR<-as.numeric(substring(as.character(ddep0$DAY),1,4))
ddep0$MONTH<-as.numeric(substring(as.character(ddep0$DAY),6,7))

ddep_PD<-NULL
for(i in 1:length(udep)){
  ddep_temp<-ddep0
  ddep_temp$dep_short<-udep[i]
  ddep_temp$DAY<-as.character(ddep_temp$DAY)
  ddep_temp$daycounter<-c(0:((12*365)+84))+1

  df_count_in<-as.data.frame(table(pd$date0[pd$dep_short==udep[i]]))
  names(df_count_in)<-c("DAY","count_in")
  df_count_in$DAY<-as.character(df_count_in$DAY)
  ddep_temp<-merge(ddep_temp, df_count_in, by="DAY", all.x=T, all.y=T)

  df_count_out<-as.data.frame(table(pd$dateR[pd$dep_short==udep[i] & pd$RESPUESTA!="Sin respuesta" & pd$dateR<=ymd("2015-08-31")])) #Don't count "Sin respuesta" -- these have not yet received a response; and stop at August 31 2015 when data stops.
  names(df_count_out)<-c("DAY","count_out")
  df_count_out$DAY<-as.character(df_count_out$DAY)
  ddep_temp<-merge(ddep_temp, df_count_out, by="DAY", all.x=T, all.y=T)

  ddep_temp$count_in[is.na(ddep_temp$count_in)]<-0
  ddep_temp$count_out[is.na(ddep_temp$count_out)]<-0

  # SSP and COFEPRIS did not exist for the full period of time
  if(udep[i]=="COFEPRIS") ddep_temp<-ddep_temp[ddep_temp$daycounter>=ddep_temp$daycounter[ddep_temp$DAY==min(pd$date0[pd$dep_short=="COFEPRIS"])],]
  if(udep[i]=="SSP") ddep_temp<-ddep_temp[ddep_temp$daycounter<=ddep_temp$daycounter[ddep_temp$DAY==max(pd$date0[pd$dep_short=="SSP"])],]

  ddep_PD<-rbind(ddep_PD, ddep_temp)
}

# aggregate to weeks
ddep_PD$WEEK<-floor(ddep_PD$daycounter/7)

weeklydata_PD<-NULL
for(i in 1:length(udep)){
  tempweeklydata<-as.data.frame(cbind(
  tapply(ddep_PD$WEEK[ddep_PD$dep_short==udep[i]], ddep_PD$WEEK[ddep_PD$dep_short==udep[i]], head, 1),
  tapply(ddep_PD$daycounter[ddep_PD$dep_short==udep[i]], ddep_PD$WEEK[ddep_PD$dep_short==udep[i]], head, 1),
  tapply(ddep_PD$count_in[ddep_PD$dep_short==udep[i]], ddep_PD$WEEK[ddep_PD$dep_short==udep[i]], sum),
  tapply(ddep_PD$count_out[ddep_PD$dep_short==udep[i]], ddep_PD$WEEK[ddep_PD$dep_short==udep[i]], sum)  
  ))

  tempweeklydata<-cbind(rep(udep[i], nrow(tempweeklydata)) , tempweeklydata)
  weeklydata_PD<-rbind(weeklydata_PD,  tempweeklydata)
}

dim(weeklydata_PD)

names(weeklydata_PD)<-c("dep_short","WEEK","daycounter","count_in_PD","count_out_PD")

weeklydata_PD$count_in_PD_l1<-lagger(weeklydata_PD$count_in_PD, weeklydata_PD$dep_short, weeklydata_PD$WEEK, 1)
weeklydata_PD$count_out_PD_l1<-lagger(weeklydata_PD$count_out_PD, weeklydata_PD$dep_short, weeklydata_PD$WEEK, 1)


weeklydata_merge1<-merge(weeklydata, weeklydata_PD, by=c("dep_short","WEEK"), all.x=T, all.y=F)


# panel fixed effects models of request count in
m1<-felm(log(count_in_PD+1)~ log(count_in_PD_l1+1)+ anom  | dep_short + WEEK | 0 | dep_short, data=weeklydata_merge1[weeklydata_merge1$YEAR>=2005,])
m2<-felm(log(count_in_PD+1)~anom_negative + anom_positive + log(count_in_PD_l1+1) | dep_short + WEEK | 0 | dep_short, data=weeklydata_merge1[weeklydata_merge1$YEAR>=2005,])
m3<-felm(log(count_in_PD+1)~anom_govfailure+anom_corruption+anom_controversy + anom_positive + log(count_in_PD_l1+1) | dep_short + WEEK | 0 | dep_short, data=weeklydata_merge1[weeklydata_merge1$YEAR>=2005,])
m4<-felm(log(count_in_PD+1)~anom_policy+anom_personnel+anom_external+anom_govfailure+anom_corruption + log(count_in_PD_l1+1) | dep_short + WEEK | 0 | dep_short, data=weeklydata_merge1[weeklydata_merge1$YEAR>=2005,])

texreg(list(m1, m2,  m3, m4), digits=3,  stars = c(0.01, 0.05, 0.1))

# tables with p-values instead of SEs, per journal guidelines
texreg(list(m1, m2,  m3, m4), 
  override.se=lapply(list(m1, m2,  m3, m4), function(x) summary(x)$coef[,4]), 
  override.pval=lapply(list(m1, m2,  m3, m4), function(x) summary(x)$coef[,4]), 
  digits=3,  stars = c(0.01, 0.05, 0.1))


# panel fixed effects models of response count out
m1<-felm(log(count_out_PD+1)~log(count_in_PD_l1+1)+log(count_out_PD_l1+1) + anom  | dep_short + WEEK | 0 | dep_short, data=weeklydata_merge1[weeklydata_merge1$YEAR>=2005,])
m2<-felm(log(count_out_PD+1)~log(count_in_PD_l1+1)+log(count_out_PD_l1+1) + anom_negative + anom_positive | dep_short + WEEK | 0 | dep_short, data=weeklydata_merge1[weeklydata_merge1$YEAR>=2005,])
m3<-felm(log(count_out_PD+1)~log(count_in_PD_l1+1)+log(count_out_PD_l1+1) + anom_govfailure+anom_corruption+anom_controversy + anom_positive | dep_short + WEEK | 0 | dep_short, data=weeklydata_merge1[weeklydata_merge1$YEAR>=2005,])
m4<-felm(log(count_out_PD+1)~log(count_in_PD_l1+1)+log(count_out_PD_l1+1) + anom_policy+anom_personnel+anom_external+anom_govfailure+anom_corruption  | dep_short + WEEK | 0 | dep_short, data=weeklydata_merge1[weeklydata_merge1$YEAR>=2005,])

texreg(list(m1, m2,  m3, m4), digits=3,  stars = c(0.01, 0.05, 0.1))

# tables with p-values instead of SEs, per journal guidelines
texreg(list(m1, m2,  m3, m4), 
  override.se=lapply(list(m1, m2,  m3, m4), function(x) summary(x)$coef[,4]), 
  override.pval=lapply(list(m1, m2,  m3, m4), function(x) summary(x)$coef[,4]), 
  digits=3,  stars = c(0.01, 0.05, 0.1))







########################################################################

# appendix models of anomaly onset

# in original agency-day panel, make anomaly onset variable
ddep$anom_l1<-lagger(ddep$anom, ddep$dep_short, ddep$daycounter, 1)
ddep$anom_onset <- as.numeric(ddep$anom==1 & ddep$anom_l1==0)

# variable for post-onset anom dates to exclude from sample
ddep$anom_excludingonset <- as.numeric(ddep$anom==1 &  ddep$anom_onset==0)


# make 7 day rolling sum of incoming requests
ddep$count_in_7daysum <- NA

for(i in 1:length(udep)){
    ddep$count_in_7daysum[ddep$dep_short==udep[i]] <- rollsum(ddep$count_in[ddep$dep_short==udep[i]], k=7, align="right", fill=0)
}

# lag these so as not to count requests on day of
ddep$count_in_7daysum_l1<-lagger(ddep$count_in_7daysum, ddep$dep_short, ddep$daycounter, 1)

# days since last onset
ddep$days_since_last_onset <- 0

for(i in 1:length(udep)){
    for(j in 1:length(ddep$DAY[ddep$dep_short==udep[i]])){
        anom_temp<-ddep$anom_onset[ddep$dep_short==udep[i] & ddep$DAY==ddep$DAY[ddep$dep_short==udep[i]][j]]
        previous_temp<-ddep$days_since_last_onset[ddep$dep_short==udep[i] & ddep$DAY==ddep$DAY[ddep$dep_short==udep[i]][j-1]] + 1
        if(length(previous_temp)==0) previous_temp<-0
        # counter of days increments from previous day +1
        if(anom_temp==0) ddep$days_since_last_onset[ddep$dep_short==udep[i] & ddep$DAY==ddep$DAY[ddep$dep_short==udep[i]][j]] <- previous_temp
        # BUT if onset=1 then reset to 0
        if(anom_temp==1) ddep$days_since_last_onset[ddep$dep_short==udep[i] & ddep$DAY==ddep$DAY[ddep$dep_short==udep[i]][j]] <- 0
    }
print(i)
}

ddep$days_since_last_onset_l1<-lagger(ddep$days_since_last_onset, ddep$dep_short, ddep$daycounter, 1)
ddep$log_days_since_last_onset<-log(ddep$days_since_last_onset_l1 + 1)


onsetmodel1<- glm(anom_onset ~ 
  log(count_in_7daysum_l1+1) + log_days_since_last_onset,
  , data=ddep[ddep$anom_excludingonset==0 & ddep$YEAR>=2005,], family=binomial(link=logit))

onsetmodel2<- glm(anom_onset ~ 
  log(count_in_7daysum_l1+1) + days_since_last_onset_l1 + I(days_since_last_onset_l1^2) + I(days_since_last_onset_l1^3) ,
  , data=ddep[ddep$anom_excludingonset==0 & ddep$YEAR>=2005,], family=binomial(link=logit))

onsetmodel3<- glm(anom_onset ~ 
  log(count_in_7daysum_l1+1) + log_days_since_last_onset + as.factor(dep_short),
  , data=ddep[ddep$anom_excludingonset==0 & ddep$YEAR>=2005,], family=binomial(link=logit))

onsetmodel4<- glm(anom_onset ~ 
  log(count_in_7daysum_l1+1) + log_days_since_last_onset + as.factor(YEAR) + as.factor(MONTH) + as.factor(dep_short),
  , data=ddep[ddep$anom_excludingonset==0 & ddep$YEAR>=2005,], family=binomial(link=logit))



# tables with p-values instead of SEs, per journal guidelines

texreg(list(onsetmodel1, onsetmodel2, onsetmodel3, onsetmodel4), 
  override.se=lapply(list(onsetmodel1, onsetmodel2, onsetmodel3, onsetmodel4), function(x) summary(x)$coef[,4]), 
  override.pval=lapply(list(onsetmodel1, onsetmodel2, onsetmodel3, onsetmodel4), function(x) summary(x)$coef[,4]), 
  digits=3,  stars = c(0.01, 0.05, 0.1))









########################################################################

# Matched group analyses


library(timeDate)
holidays<-read.csv("NationalHolidays.csv", header=T)
holidays$holiday<-mdy(as.character(holidays$holiday))

dayvector<-ymd(d$date0[1]) + c(0:((12*365)+100)) * days(1)

# this will be used to remove national holidays from the counts of days-to-response
master_calendar<-dayvector[isBizday(as.timeDate(dayvector))]
master_calendar<-master_calendar[!master_calendar%in% holidays$holiday]


# prepare request data
d$dateR<-ymd(d$dateR) #response date
d$timeD<-as.numeric(d$dateR-d$date0) # time-to-response: response date minus request date
d$ref_FOLIO<-NA #this will references the FOLIO id of each "treated" request, and will allow FE for each comparison group
d$ref_anom<-NA #anomaly reference
d$anom_exposed<-NA #treatment variable

IDlist<-anomaly_data$ID



# Functions to parallelize


create_matched_sample<-function(anom_data, req_data, IDtemp, master_calendar, k){

  set.seed(k) # set random seed

  # function to count working days -- needs to be created "inside" the function in order for parallelization to work
      WDfun<-function(date1,date2){
      length(master_calendar[master_calendar>date1 & master_calendar<=date2])
    }
    WDfunV<-Vectorize(WDfun)

  tempdep <- anom_data$agency[anom_data$ID==IDtemp]
  tempdate <- ymd(anom_data$start_date[anom_data$ID==IDtemp])

  # requests that are "in queue" at the agency on the anomaly onset date
  temp_requests<-req_data[req_data$dep_short==tempdep & req_data$date0<tempdate & req_data$dateR>tempdate,]

  temp_requests$days_elapsed1 <- WDfunV(temp_requests$date0, tempdate)
  temp_requests$days_untilreponse1 <- WDfunV(tempdate, temp_requests$dateR)

  temp_requests$ref_FOLIO<-paste(IDtemp, temp_requests$FOLIO, sep="_")
  temp_requests$ref_anom<-rep(IDtemp, length(temp_requests$anom_exposed)) 
  
  # main treatment variable -- this will be zero for the "comparison" requests from non-anomaly dates.
  temp_requests$anom_exposed<-rep(1,length(temp_requests$anom_exposed)) 

  # cap this set of "in queue" requests at 100; sample if larger than that
  if(nrow(temp_requests)>100) temp_requests<-temp_requests[sample(x=(1:nrow(temp_requests)), size=100, replace=F ),]

  # set of all anomaly days for that agency (to exclude from potential queue dates)
  temp_anom_days<-NULL
  if(nrow(anom_data[anom_data$agency==tempdep,c("start_date","end_date")])>1) temp_anom_days<-ymd(as.character( do.call("c",apply(anom_data[anom_data$agency==tempdep,c("start_date","end_date")], 1, function(x){seq(ymd(x[1]), ymd(x[2]), 1)}) ) ))


  # select potential comparison dates in a window before and after the reference anomaly, then select requests that are "in queue" on those dates
  # use a range from 2.5 years - 0.5 years before, and 0.5 years - 2.5 years after; thus excluding the immediate year surrounding the reference anomaly
  # also exclude any other anomaly dates for that same agency
  # sample ten comparison requests each from those "in queue" on the "before" dates and the "after" dates

  potential_queuedates<- seq(tempdate-days(round(365*2.5,0)), tempdate-days(round(365*0.5,0)), 1)
  potential_queuedates<-potential_queuedates[!potential_queuedates %in% temp_anom_days]

  sampled_queuedates_past<-sample(x=potential_queuedates, size=10, replace=F)
  potential_queuedates_future<- seq(tempdate+days(round(365*0.5,0)), tempdate+days(round(365*2.5,0)), 1) 
  potential_queuedates_future<-potential_queuedates_future[!potential_queuedates_future %in% temp_anom_days]

  sampled_queuedates_future<-sample(x=potential_queuedates_future, size=10, replace=F)
  sampled_queuedates<-c(sampled_queuedates_past, sampled_queuedates_future)

  temp_formatching<-NULL

  for(p in 1:length(sampled_queuedates)){
      queue_on_sampled_date<-req_data[req_data$dep_short==tempdep & req_data$date0<sampled_queuedates[p] & req_data$dateR>sampled_queuedates[p],] 
      queue_on_sampled_date$queue_sample_date<-rep(sampled_queuedates[p] , nrow(queue_on_sampled_date))
      temp_formatching<-rbind(temp_formatching, queue_on_sampled_date)
    }
  # this yields temp_formatching, which contains the set of sampled requests from the relevant queues on the sampled comparison dates

  # use the pool of potential comparison requests to create matched groups of comparison requests for each actual-anomaly request
  temp_mergedata<-NULL

  if(nrow(temp_requests)>0) for(j in 1:nrow(temp_requests)){
      temp_matchset<-temp_formatching[temp_formatching$MaxTopic==temp_requests$MaxTopic[j],]

      temp_matchset$ref_FOLIO<-rep( paste(IDtemp, temp_requests$FOLIO[j], sep="_") , length(temp_matchset$ref_FOLIO))
      temp_matchset$ref_anom<-rep(IDtemp, length(temp_matchset$ref_FOLIO))
      temp_matchset$anom_exposed<-rep(0,length(temp_matchset$anom_exposed))

      temp_matchset$days_elapsed1<- WDfunV(temp_matchset$date0, temp_matchset$queue_sample_date)

      # for real anomaly, DV is response_date - onset_date
      # for comparison, DV is response_date - sampled_onset_date
      temp_matchset$days_untilreponse1<- WDfunV(temp_matchset$queue_sample_date, temp_matchset$dateR)

      # further restrict to same number of days elapsed
      temp_matchset<-temp_matchset[temp_matchset$days_elapsed1==temp_requests$days_elapsed1[j],]

      # to ensure each comparison group is always same size, sample 10 from each
     if(nrow(temp_matchset)>0) temp_mergedata<-rbind(temp_mergedata, temp_matchset[sample(x=(1:nrow(temp_matchset)), size=10, replace=T ),] )
    }


  # merge together
  if(NROW(temp_mergedata)>0)  newdataTEMP<-rbind(temp_requests, temp_mergedata[,queue_sample_date:=NULL]) # removing the "queue_sample_date" variable as it is not present in temp_requests

  newdataTEMP
}






anom_and_models_function<-function(anom_data, req_data, IDtemplist, master_calendar){

  # create matched sample dataset using function create_matched_sample()
  matched_sample_data<-foreach(i=1:length(IDtemplist), .combine='rbind') %do% create_matched_sample(anom_data, req_data, IDtemplist[i], master_calendar, k)

  # create variables for anomaly theme or type
  matched_sample_data$Policy<-as.numeric(matched_sample_data$ref_anom %in% anomaly_data$ID[anomaly_data$Policy==1] )
  matched_sample_data$Personnel<-as.numeric(matched_sample_data$ref_anom %in% anomaly_data$ID[anomaly_data$Personnel==1] )
  matched_sample_data$External<-as.numeric(matched_sample_data$ref_anom %in% anomaly_data$ID[anomaly_data$External==1] )
  matched_sample_data$GovFailure<-as.numeric(matched_sample_data$ref_anom %in% anomaly_data$ID[anomaly_data$GovFailure==1] )
  matched_sample_data$Corruption<-as.numeric(matched_sample_data$ref_anom %in% anomaly_data$ID[anomaly_data$Corruption==1] )
  matched_sample_data$NoTheme<-as.numeric(matched_sample_data$ref_anom %in% anomaly_data$ID[anomaly_data$NoTheme==1] )
  matched_sample_data$Negative<-as.numeric(matched_sample_data$ref_anom %in% anomaly_data$ID[anomaly_data$Negative==1] )
  matched_sample_data$Positive<-as.numeric(matched_sample_data$ref_anom %in% anomaly_data$ID[anomaly_data$Negative==0] )
  matched_sample_data$Controversy<-as.numeric(matched_sample_data$Negative==1 & matched_sample_data$GovFailure==0 & matched_sample_data$Corruption==0)


  # table 1 models

  m1<-felm(log(days_untilreponse1+1) ~ anom_exposed
    | ref_FOLIO  | 0 | ref_FOLIO,
  data=matched_sample_data[matched_sample_data$NoTheme==0,]) 

  m2<-felm(log(days_untilreponse1+1) ~ anom_exposed+logwordcount+readability +attachment+ MEDIOENTRADA+ legalism_score+punctuation_score+log(agency_workload) 
    | ref_FOLIO  | 0 | ref_FOLIO,
  data=matched_sample_data[matched_sample_data$NoTheme==0,]) 

  m3<-felm(badresponse ~ anom_exposed
   | ref_FOLIO | 0 | ref_FOLIO,
  data=matched_sample_data[matched_sample_data$NoTheme==0,]) 

  m4<-felm(badresponse ~ anom_exposed+logwordcount+readability +attachment+ MEDIOENTRADA+ legalism_score+punctuation_score+log(agency_workload) 
   | ref_FOLIO | 0 | ref_FOLIO,
  data=matched_sample_data[matched_sample_data$NoTheme==0,]) 



  outputvector1<-c(
  coef(m1),
  coef(m2),
  coef(m3),
  coef(m4),

  sqrt(diag(vcov(m1))),
  sqrt(diag(vcov(m2))),
  sqrt(diag(vcov(m3))),
  sqrt(diag(vcov(m4))),

  summary(m1)$r.square,
  summary(m2)$r.square,
  summary(m3)$r.square,
  summary(m4)$r.square,

  m1$N,
  m2$N,
  m3$N,
  m4$N
    )




  # table 2 models

  m5<-felm(log(days_untilreponse1+1) ~ anom_exposed:Positive + anom_exposed:Negative
   | ref_FOLIO  | 0 | ref_FOLIO,
  data=matched_sample_data[matched_sample_data$NoTheme==0,]) 

  m6<-felm(badresponse ~ anom_exposed:Positive + anom_exposed:Negative
   | ref_FOLIO  | 0 | ref_FOLIO,
  data=matched_sample_data[matched_sample_data$NoTheme==0,]) 

  m7<-felm(log(days_untilreponse1+1) ~ anom_exposed:Positive + anom_exposed:GovFailure + anom_exposed:Corruption + anom_exposed:Controversy
   | ref_FOLIO  | 0 | ref_FOLIO,
  data=matched_sample_data[matched_sample_data$NoTheme==0,]) 

  m8<-felm(badresponse ~ anom_exposed:Positive + anom_exposed:GovFailure + anom_exposed:Corruption + anom_exposed:Controversy
   | ref_FOLIO  | 0 | ref_FOLIO,
  data=matched_sample_data[matched_sample_data$NoTheme==0,]) 

  m9<-felm(log(days_untilreponse1+1) ~ anom_exposed:Policy + anom_exposed:Personnel + anom_exposed:External + anom_exposed:GovFailure + anom_exposed:Corruption  
   | ref_FOLIO  | 0 | ref_FOLIO,
  data=matched_sample_data[matched_sample_data$NoTheme==0,]) 

  m10<-felm(badresponse ~ anom_exposed:Policy + anom_exposed:Personnel + anom_exposed:External + anom_exposed:GovFailure + anom_exposed:Corruption  
   | ref_FOLIO  | 0 | ref_FOLIO,
  data=matched_sample_data[matched_sample_data$NoTheme==0,]) 




  outputvector2<-c(
  coef(m5),
  coef(m6),
  coef(m7),
  coef(m8),
  coef(m9),
  coef(m10),

  sqrt(diag(vcov(m5))),
  sqrt(diag(vcov(m6))),
  sqrt(diag(vcov(m7))),
  sqrt(diag(vcov(m8))),
  sqrt(diag(vcov(m9))),
  sqrt(diag(vcov(m10))),


  summary(m5)$r.square,
  summary(m6)$r.square,
  summary(m7)$r.square,
  summary(m8)$r.square,
  summary(m9)$r.square,
  summary(m10)$r.square,


  m5$N,
  m6$N,
  m7$N,
  m8$N,
  m9$N,
  m10$N
    )



  outputvector<-c(
  k, 
  outputvector1,
  outputvector2 
    )


  outputvector
}




##### IMPORTANT NOTE #####
# note that the following took roughly *one week* to complete on a 4-core Apple desktop computer

timestamp()
cores<-detectCores() #setup parallel processors
cl <- makeCluster(cores[1]-1) #use one less than number available
registerDoParallel(cl)

parallel_output<- foreach(k=1:1000, .combine='cbind', .packages = c('foreach', 'lfe', 'lubridate', 'lmtest', 'MASS', 'data.table')) %dopar% anom_and_models_function(anom_data=anomaly_data,  req_data=d[d$RESPUESTA!="Sin respuesta",],  IDtemplist=IDlist, master_calendar=master_calendar)
write.csv( parallel_output, "matched_samples_results_output.csv", row.names=T)

#stop cluster
stopCluster(cl)
timestamp()





# read back in results

res_full<-read.csv("matched_samples_results_output.csv")

res_full<-res_full[,2:ncol(res_full)] # this omits the first column which is just the variable names
res_means<-apply(res_full, 1, mean)

tab1_m1_coef<-res_means[2]
tab1_m1_se<-res_means[11]

tab1_m2_coef<-res_means[3:10]
tab1_m2_se<-res_means[12:19]

tab1_m3_coef<-res_means[20]
tab1_m3_se<-res_means[29]

tab1_m4_coef<-res_means[21:28]
tab1_m4_se<-res_means[30:37]

tab1_R<-res_means[38:41]
tab1_N<-res_means[42:45]


tab2_m1_coef<-res_means[46:47]
tab2_m1_se<-res_means[68:69]

tab2_m2_coef<-res_means[48:49]
tab2_m2_se<-res_means[70:71]

tab2_m3_coef<-res_means[50:53]
tab2_m3_se<-res_means[72:75]

tab2_m4_coef<-res_means[54:57]
tab2_m4_se<-res_means[76:79]

tab2_m5_coef<-res_means[58:62]
tab2_m5_se<-res_means[80:84]

tab2_m6_coef<-res_means[63:67]
tab2_m6_se<-res_means[85:89]

tab2_R<-res_means[90:95]
tab2_N<-res_means[96:101]




# need to re-run once to get a new draw of model results, just to create the results tables. 
# (these results will be over-written by the saved bootstrapped results)

k<-1
anom_data<-anomaly_data
req_data<-d[d$RESPUESTA!="Sin respuesta",]
IDtemplist<-IDlist
master_calendar<-master_calendar

# create matched sample dataset
matched_sample_data<-foreach(i=1:length(IDtemplist), .combine='rbind') %do% create_matched_sample(anom_data, req_data, IDtemplist[i], master_calendar, k)


# create variables for anomaly theme or type
matched_sample_data$Policy<-as.numeric(matched_sample_data$ref_anom %in% anomaly_data$ID[anomaly_data$Policy==1] )
matched_sample_data$Personnel<-as.numeric(matched_sample_data$ref_anom %in% anomaly_data$ID[anomaly_data$Personnel==1] )
matched_sample_data$External<-as.numeric(matched_sample_data$ref_anom %in% anomaly_data$ID[anomaly_data$External==1] )
matched_sample_data$GovFailure<-as.numeric(matched_sample_data$ref_anom %in% anomaly_data$ID[anomaly_data$GovFailure==1] )
matched_sample_data$Corruption<-as.numeric(matched_sample_data$ref_anom %in% anomaly_data$ID[anomaly_data$Corruption==1] )
matched_sample_data$NoTheme<-as.numeric(matched_sample_data$ref_anom %in% anomaly_data$ID[anomaly_data$NoTheme==1] )
matched_sample_data$Negative<-as.numeric(matched_sample_data$ref_anom %in% anomaly_data$ID[anomaly_data$Negative==1] )
matched_sample_data$Positive<-as.numeric(matched_sample_data$ref_anom %in% anomaly_data$ID[anomaly_data$Negative==0] )
matched_sample_data$Controversy<-as.numeric(matched_sample_data$Negative==1 & matched_sample_data$GovFailure==0 & matched_sample_data$Corruption==0)



# table 1 models

m1<-felm(log(days_untilreponse1+1) ~ anom_exposed
  | ref_FOLIO  | 0 | ref_FOLIO,
data=matched_sample_data[matched_sample_data$NoTheme==0,]) 

m2<-felm(log(days_untilreponse1+1) ~ anom_exposed+logwordcount+readability +attachment+ MEDIOENTRADA+ legalism_score+punctuation_score+log(agency_workload) 
  | ref_FOLIO  | 0 | ref_FOLIO,
data=matched_sample_data[matched_sample_data$NoTheme==0,]) 

m3<-felm(badresponse ~ anom_exposed
 | ref_FOLIO | 0 | ref_FOLIO,
data=matched_sample_data[matched_sample_data$NoTheme==0,]) 

m4<-felm(badresponse ~ anom_exposed+logwordcount+readability +attachment+ MEDIOENTRADA+ legalism_score+punctuation_score+log(agency_workload) 
 | ref_FOLIO | 0 | ref_FOLIO,
data=matched_sample_data[matched_sample_data$NoTheme==0,]) 


# table 2 models

m5<-felm(log(days_untilreponse1+1) ~ anom_exposed:Positive + anom_exposed:Negative
 | ref_FOLIO  | 0 | ref_FOLIO,
data=matched_sample_data[matched_sample_data$NoTheme==0,]) 

m6<-felm(badresponse ~ anom_exposed:Positive + anom_exposed:Negative
 | ref_FOLIO  | 0 | ref_FOLIO,
data=matched_sample_data[matched_sample_data$NoTheme==0,]) 

m7<-felm(log(days_untilreponse1+1) ~ anom_exposed:Positive + anom_exposed:GovFailure + anom_exposed:Corruption + anom_exposed:Controversy
 | ref_FOLIO  | 0 | ref_FOLIO,
data=matched_sample_data[matched_sample_data$NoTheme==0,]) 

m8<-felm(badresponse ~ anom_exposed:Positive + anom_exposed:GovFailure + anom_exposed:Corruption + anom_exposed:Controversy
 | ref_FOLIO  | 0 | ref_FOLIO,
data=matched_sample_data[matched_sample_data$NoTheme==0,]) 

m9<-felm(log(days_untilreponse1+1) ~ anom_exposed:Policy + anom_exposed:Personnel + anom_exposed:External + anom_exposed:GovFailure + anom_exposed:Corruption  
 | ref_FOLIO  | 0 | ref_FOLIO,
data=matched_sample_data[matched_sample_data$NoTheme==0,]) 

m10<-felm(badresponse ~ anom_exposed:Policy + anom_exposed:Personnel + anom_exposed:External + anom_exposed:GovFailure + anom_exposed:Corruption  
 | ref_FOLIO  | 0 | ref_FOLIO,
data=matched_sample_data[matched_sample_data$NoTheme==0,]) 




texreg(list(m1, m2, m3, m4), 
  override.coef=list(tab1_m1_coef, tab1_m2_coef, tab1_m3_coef, tab1_m4_coef),
  override.se=list(tab1_m1_se, tab1_m2_se, tab1_m3_se, tab1_m4_se),
  override.pvalues=list(
    2*pt(-abs( tab1_m1_coef/tab1_m1_se ),df= m1$df),
    2*pt(-abs( tab1_m2_coef/tab1_m2_se ),df= m2$df),
    2*pt(-abs( tab1_m3_coef/tab1_m3_se ),df= m3$df),
    2*pt(-abs( tab1_m4_coef/tab1_m4_se ),df= m4$df)),
  digits=3,  stars = c(0.01, 0.05, 0.1))
cat(as.numeric(round(tab1_R, 3)), sep="   &   ")
cat(as.numeric(round(tab1_N, 0)), sep="   &   ")

# tables with p-values instead of SEs, per journal guidelines
tab1_m1_pval<-2*pt(-abs( tab1_m1_coef/tab1_m1_se ),df= m1$df)
tab1_m2_pval<-2*pt(-abs( tab1_m2_coef/tab1_m2_se ),df= m2$df)
tab1_m3_pval<-2*pt(-abs( tab1_m3_coef/tab1_m3_se ),df= m3$df)
tab1_m4_pval<-2*pt(-abs( tab1_m4_coef/tab1_m4_se ),df= m4$df)

texreg(list(m1, m2, m3, m4), 
  override.coef=list(tab1_m1_coef, tab1_m2_coef, tab1_m3_coef, tab1_m4_coef),
  override.se=list(tab1_m1_pval, tab1_m2_pval, tab1_m3_pval, tab1_m4_pval),
  override.pvalues=list(tab1_m1_pval, tab1_m2_pval, tab1_m3_pval, tab1_m4_pval),
  digits=3,  stars = c(0.01, 0.05, 0.1))
cat(as.numeric(round(tab1_R, 3)), sep="   &   ")
cat(as.numeric(round(tab1_N, 0)), sep="   &   ")









texreg(list(m5, m6, m7, m8, m9, m10), 
  override.coef=list(tab2_m1_coef, tab2_m2_coef, tab2_m3_coef, tab2_m4_coef, tab2_m5_coef, tab2_m6_coef),
  override.se=list(tab2_m1_se, tab2_m2_se, tab2_m3_se, tab2_m4_se, tab2_m5_se, tab2_m6_se),
  override.pvalues=list(
    2*pt(-abs( tab2_m1_coef/tab2_m1_se ),df= m5$df),
    2*pt(-abs( tab2_m2_coef/tab2_m2_se ),df= m6$df),
    2*pt(-abs( tab2_m3_coef/tab2_m3_se ),df= m7$df),
    2*pt(-abs( tab2_m4_coef/tab2_m4_se ),df= m8$df),
    2*pt(-abs( tab2_m5_coef/tab2_m5_se ),df= m9$df),
    2*pt(-abs( tab2_m6_coef/tab2_m6_se ),df= m10$df)),
  digits=3,  stars = c(0.01, 0.05, 0.1))
cat(as.numeric(round(tab2_R, 3)), sep="   &   ")
cat(as.numeric(round(tab2_N, 0)), sep="   &   ")


# tables with p-values instead of SEs, per journal guidelines
tab2_m1_pval<-2*pt(-abs( tab2_m1_coef/tab2_m1_se ),df= m5$df)
tab2_m2_pval<-2*pt(-abs( tab2_m2_coef/tab2_m2_se ),df= m6$df)
tab2_m3_pval<-2*pt(-abs( tab2_m3_coef/tab2_m3_se ),df= m7$df)
tab2_m4_pval<-2*pt(-abs( tab2_m4_coef/tab2_m4_se ),df= m8$df)
tab2_m5_pval<-2*pt(-abs( tab2_m5_coef/tab2_m5_se ),df= m9$df)
tab2_m6_pval<-2*pt(-abs( tab2_m6_coef/tab2_m6_se ),df= m10$df)

texreg(list(m5, m6, m7, m8, m9, m10), 
  override.coef=list(tab2_m1_coef, tab2_m2_coef, tab2_m3_coef, tab2_m4_coef, tab2_m5_coef, tab2_m6_coef),
  override.se=list(tab2_m1_pval, tab2_m2_pval, tab2_m3_pval, tab2_m4_pval, tab2_m5_pval, tab2_m6_pval),
  override.pvalues=list(tab2_m1_pval, tab2_m2_pval, tab2_m3_pval, tab2_m4_pval, tab2_m5_pval, tab2_m6_pval),
  digits=3,  stars = c(0.01, 0.05, 0.1))
cat(as.numeric(round(tab2_R, 3)), sep="   &   ")
cat(as.numeric(round(tab2_N, 0)), sep="   &   ")















